/* The copyright in this software is being made available under the BSD
 * License, included below. This software may be subject to other third party
 * and contributor rights, including patent rights, and no such rights are
 * granted under this license.
 *
 * Copyright (c) 2010-2024, ITU/ISO/IEC
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 *  * Redistributions of source code must retain the above copyright notice,
 *    this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *  * Neither the name of the ITU/ISO/IEC nor the names of its contributors may
 *    be used to endorse or promote products derived from this software without
 *    specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
 * THE POSSIBILITY OF SUCH DAMAGE.
 */

/** \file     QuantRDOQ.cpp
    \brief    transform and quantization class
*/

#include "QuantRDOQ.h"

#include "UnitTools.h"
#include "ContextModelling.h"
#include "CodingStructure.h"

#include "dtrace_next.h"
#include "dtrace_buffer.h"

#include <stdlib.h>
#include <limits>
#include <memory.h>


struct coeffGroupRDStats
{
  int    iNNZbeforePos0;
  double d64CodedLevelandDist; // distortion and level cost only
  double d64UncodedDist;    // all zero coded block distortion
  double d64SigCost;
  double d64SigCost_0;
  int   iNumSbbCtxBins;
};


//! \ingroup CommonLib
//! \{

// ====================================================================================================================
// Constants
// ====================================================================================================================


// ====================================================================================================================
// Static functions
// ====================================================================================================================

// ====================================================================================================================
// QuantRDOQ class member functions
// ====================================================================================================================


QuantRDOQ::QuantRDOQ( const Quant* other ) : Quant( other )
{

  const QuantRDOQ *rdoq = dynamic_cast<const QuantRDOQ*>( other );
  CHECK( other && !rdoq, "The RDOQ cast must be successfull!" );
  xInitScalingList( rdoq );
}

QuantRDOQ::~QuantRDOQ()
{
  xDestroyScalingList();
}


/** Get the best level in RD sense
 *
 * \returns best quantized transform level for given scan position
 *
 * This method calculates the best quantized transform level for a given scan position.
 */
inline uint32_t QuantRDOQ::xGetCodedLevel( double&            rd64CodedCost,
                                       double&            rd64CodedCost0,
                                       double&            rd64CodedCostSig,
                                       Intermediate_Int   lLevelDouble,
                                       uint32_t               uiMaxAbsLevel,
                                       const BinFracBits* fracBitsSig,
                                       const BinFracBits& fracBitsPar,
                                       const BinFracBits& fracBitsGt1,
                                       const BinFracBits& fracBitsGt2,
                                       const int          remRegBins,
                                       unsigned           goRiceZero,
                                       uint16_t             ui16AbsGoRice,
                                       int                iQBits,
                                       double             errorScale,
                                       bool               bLast,
                                       bool               useLimitedPrefixLength,
                                       const int          maxLog2TrDynamicRange
                                     ) const
{
  double dCurrCostSig   = 0;
  uint32_t   uiBestAbsLevel = 0;

  if( !bLast && uiMaxAbsLevel < 3 )
  {
    rd64CodedCostSig    = xGetRateSigCoef( *fracBitsSig, 0 );
    rd64CodedCost       = rd64CodedCost0 + rd64CodedCostSig;
    if( uiMaxAbsLevel == 0 )
    {
      return uiBestAbsLevel;
    }
  }
  else
  {
    rd64CodedCost       = MAX_DOUBLE;
  }

  if( !bLast )
  {
    dCurrCostSig        = xGetRateSigCoef( *fracBitsSig, 1 );
  }

  uint32_t uiMinAbsLevel    = ( uiMaxAbsLevel > 1 ? uiMaxAbsLevel - 1 : 1 );
  for( int uiAbsLevel  = uiMaxAbsLevel; uiAbsLevel >= uiMinAbsLevel ; uiAbsLevel-- )
  {
    double dErr         = double( lLevelDouble  - ( Intermediate_Int(uiAbsLevel) << iQBits ) );

    double dCurrCost    = dErr * dErr * errorScale + xGetICost( xGetICRate( uiAbsLevel, fracBitsPar, fracBitsGt1, fracBitsGt2, remRegBins, goRiceZero, ui16AbsGoRice, true, maxLog2TrDynamicRange ) );
    dCurrCost          += dCurrCostSig;

    if( dCurrCost < rd64CodedCost )
    {
      uiBestAbsLevel    = uiAbsLevel;
      rd64CodedCost     = dCurrCost;
      rd64CodedCostSig  = dCurrCostSig;
    }
  }

  return uiBestAbsLevel;
}

/** Calculates the cost for specific absolute transform level
 * \param uiAbsLevel scaled quantized level
 * \param ui16CtxNumOne current ctxInc for coeff_abs_level_greater1 (1st bin of coeff_abs_level_minus1 in AVC)
 * \param ui16CtxNumAbs current ctxInc for coeff_abs_level_greater2 (remaining bins of coeff_abs_level_minus1 in AVC)
 * \param ui16AbsGoRice Rice parameter for coeff_abs_level_minus3
 * \param c1Idx
 * \param c2Idx
 * \param useLimitedPrefixLength
 * \param maxLog2TrDynamicRange
 * \returns cost of given absolute transform level
 */
inline int QuantRDOQ::xGetICRate( const uint32_t         uiAbsLevel,
                                  const BinFracBits& fracBitsPar,
                                  const BinFracBits& fracBitsGt1,
                                  const BinFracBits& fracBitsGt2,
                                  const int          remRegBins,
                                  unsigned           goRiceZero,
                                  const uint16_t       ui16AbsGoRice,
                                  const bool         useLimitedPrefixLength,
                                  const int          maxLog2TrDynamicRange  ) const
{
  if( remRegBins < 4 )
  {
    int       iRate   = int( xGetIEPRate() ); // cost of sign bit
    uint32_t  symbol  = ( uiAbsLevel == 0 ? goRiceZero : uiAbsLevel <= goRiceZero ? uiAbsLevel-1 : uiAbsLevel );
    uint32_t  length;
    const int threshold = COEF_REMAIN_BIN_REDUCTION;
    if( symbol < ( threshold << ui16AbsGoRice ) )
    {
      length = symbol >> ui16AbsGoRice;
      iRate += ( length + 1 + ui16AbsGoRice ) << SCALE_BITS;
    }
    else if( useLimitedPrefixLength )
    {
      const uint32_t maximumPrefixLength = ( 32 - ( COEF_REMAIN_BIN_REDUCTION + maxLog2TrDynamicRange ) );

      uint32_t prefixLength = 0;
      uint32_t suffix = ( symbol >> ui16AbsGoRice ) - COEF_REMAIN_BIN_REDUCTION;

      while( ( prefixLength < maximumPrefixLength ) && ( suffix > ( ( 2 << prefixLength ) - 2 ) ) )
      {
        prefixLength++;
      }

      const uint32_t suffixLength = ( prefixLength == maximumPrefixLength ) ? ( maxLog2TrDynamicRange - ui16AbsGoRice ) : ( prefixLength + 1/*separator*/ );

      iRate += ( COEF_REMAIN_BIN_REDUCTION + prefixLength + suffixLength + ui16AbsGoRice ) << SCALE_BITS;
    }
    else
    {
      length = ui16AbsGoRice;
      symbol = symbol - ( threshold << ui16AbsGoRice );
      while( symbol >= ( 1 << length ) )
      {
        symbol -= ( 1 << ( length++ ) );
      }
      iRate += ( threshold + length + 1 - ui16AbsGoRice + length ) << SCALE_BITS;
    }
    return iRate;
  }

  int iRate = int( xGetIEPRate() ); // cost of sign bit
  const uint32_t cthres = 4;
  if( uiAbsLevel >= cthres )
  {
    uint32_t symbol = ( uiAbsLevel - cthres ) >> 1;
    uint32_t length;
    const int threshold = COEF_REMAIN_BIN_REDUCTION;
    if( symbol < ( threshold << ui16AbsGoRice ) )
    {
      length = symbol >> ui16AbsGoRice;
      iRate += ( length + 1 + ui16AbsGoRice ) << SCALE_BITS;
    }
    else if( useLimitedPrefixLength )
    {
      const uint32_t maximumPrefixLength = ( 32 - ( COEF_REMAIN_BIN_REDUCTION + maxLog2TrDynamicRange ) );

      uint32_t prefixLength = 0;
      uint32_t suffix = ( symbol >> ui16AbsGoRice ) - COEF_REMAIN_BIN_REDUCTION;

      while( ( prefixLength < maximumPrefixLength ) && ( suffix > ( ( 2 << prefixLength ) - 2 ) ) )
      {
        prefixLength++;
      }

      const uint32_t suffixLength = ( prefixLength == maximumPrefixLength ) ? ( maxLog2TrDynamicRange - ui16AbsGoRice ) : ( prefixLength + 1/*separator*/ );

      iRate += ( COEF_REMAIN_BIN_REDUCTION + prefixLength + suffixLength + ui16AbsGoRice ) << SCALE_BITS;
    }
    else
    {
      length = ui16AbsGoRice;
      symbol = symbol - ( threshold << ui16AbsGoRice );
      while( symbol >= ( 1 << length ) )
      {
        symbol -= ( 1 << ( length++ ) );
      }
      iRate += ( threshold + length + 1 - ui16AbsGoRice + length ) << SCALE_BITS;
    }

    iRate += fracBitsGt1.intBits[1];
    iRate += fracBitsPar.intBits[( uiAbsLevel - 2 ) & 1];
    iRate += fracBitsGt2.intBits[1];
  }
  else if( uiAbsLevel == 1 )
  {
    iRate += fracBitsGt1.intBits[0];
  }
  else if( uiAbsLevel == 2 )
  {
    iRate += fracBitsGt1.intBits[1];
    iRate += fracBitsPar.intBits[0];
    iRate += fracBitsGt2.intBits[0];
  }
  else if( uiAbsLevel == 3 )
  {
    iRate += fracBitsGt1.intBits[1];
    iRate += fracBitsPar.intBits[1];
    iRate += fracBitsGt2.intBits[0];
  }
  else
  {
    iRate = 0;
  }
  return  iRate;
}

inline double QuantRDOQ::xGetRateSigCoeffGroup( const BinFracBits& fracBitsSigCG, unsigned uiSignificanceCoeffGroup ) const
{
  return xGetICost( fracBitsSigCG.intBits[uiSignificanceCoeffGroup] );
}

/** Calculates the cost of signaling the last significant coefficient in the block
 * \param uiPosX X coordinate of the last significant coefficient
 * \param uiPosY Y coordinate of the last significant coefficient
 * \param component colour component ID
 * \returns cost of last significant coefficient
 */
/*
 * \param uiWidth width of the transform unit (TU)
*/
inline double QuantRDOQ::xGetRateLast( const int* lastBitsX, const int* lastBitsY, unsigned PosX, unsigned PosY ) const
{
  uint32_t ctxX = g_groupIdx[PosX];
  uint32_t ctxY = g_groupIdx[PosY];
  double   cost = lastBitsX[ctxX] + lastBitsY[ctxY];
  if (ctxX > 3)
  {
    cost += xGetIEPRate() * ((ctxX - 2) >> 1);
  }
  if (ctxY > 3)
  {
    cost += xGetIEPRate() * ((ctxY - 2) >> 1);
  }
  return xGetICost(cost);
}


inline double QuantRDOQ::xGetRateSigCoef( const BinFracBits& fracBitsSig, unsigned uiSignificance ) const
{
  return xGetICost( fracBitsSig.intBits[uiSignificance] );
}

/** Get the cost for a specific rate
 * \param dRate rate of a bit
 * \returns cost at the specific rate
 */
inline double QuantRDOQ::xGetICost        ( double                          dRate         ) const
{
  return m_dLambda * dRate;
}

/** Get the cost of an equal probable bit
 * \returns cost of equal probable bit
 */
inline double QuantRDOQ::xGetIEPRate      (                                               ) const
{
  return 32768;
}



/** set quantized matrix coefficient for encode
 * \param scalingList            quantized matrix address
 * \param format                 chroma format
 * \param maxLog2TrDynamicRange
 * \param bitDepths              reference to bit depth array for all channels
 */
void QuantRDOQ::setScalingList(ScalingList *scalingList, const int maxLog2TrDynamicRange[MAX_NUM_CHANNEL_TYPE], const BitDepths &bitDepths)
{
  Quant::setScalingList( scalingList, maxLog2TrDynamicRange, bitDepths );

  const int minimumQp = 0;
  const int maximumQp = SCALING_LIST_REM_NUM;

  for(uint32_t size = 0; size < SCALING_LIST_SIZE_NUM; size++)
  {
    for(uint32_t list = 0; list < SCALING_LIST_NUM; list++)
    {
      for(int qp = minimumQp; qp < maximumQp; qp++)
      {
//         xSetScalingListEnc(scalingList,list,size,qp);
//         xSetScalingListDec(*scalingList,list,size,qp);
        xSetErrScaleCoeff(list,size, size,qp,maxLog2TrDynamicRange, bitDepths);
      }
    }
  }
}


double QuantRDOQ::xGetErrScaleCoeff(const bool needsSqrt2, SizeType width, SizeType height, int qp, const int maxLog2TrDynamicRange, const int channelBitDepth, bool bTransformSkip=false)
{
  const int iTransformShift = bTransformSkip ? 0 : getTransformShift(channelBitDepth, Size(width, height), maxLog2TrDynamicRange);
  double    dErrScale = (double)(1 << SCALE_BITS);                                // Compensate for scaling of bitcount in Lagrange cost function
  double    dTransShift = (double)iTransformShift + (needsSqrt2 ? -0.5 : 0.0);
  dErrScale = dErrScale * pow(2.0, (-2.0*dTransShift));                     // Compensate for scaling through forward transform
  const int  QStep = g_quantScales[needsSqrt2 ? 1 : 0][qp];
  double    finalErrScale = dErrScale / QStep / QStep / (1 << (DISTORTION_PRECISION_ADJUSTMENT(channelBitDepth) << 1));
  return    finalErrScale;
}



/** set error scale coefficients
 * \param list                   list ID
 * \param size
 * \param qp                     quantization parameter
 * \param maxLog2TrDynamicRange
 * \param bitDepths              reference to bit depth array for all channels
 */
void QuantRDOQ::xSetErrScaleCoeff( uint32_t list, uint32_t sizeX, uint32_t sizeY, int qp, const int maxLog2TrDynamicRange[MAX_NUM_CHANNEL_TYPE], const BitDepths &bitDepths )
{
  const int width = g_scalingListSizeX[sizeX];
  const int height = g_scalingListSizeX[sizeY];
  const ChannelType channelType =
    ((list == 0) || (list == MAX_NUM_COMPONENT)) ? ChannelType::LUMA : ChannelType::CHROMA;
  const int channelBitDepth = bitDepths[channelType];
  const int iTransformShift = getTransformShift(
    channelBitDepth, Size(g_scalingListSizeX[sizeX], g_scalingListSizeX[sizeY]),
    maxLog2TrDynamicRange[to_underlying(channelType)]);   // Represents scaling through forward transform

  uint32_t i, uiMaxNumCoeff = width * height;
  int *piQuantcoeff;
  double *pdErrScale;
  piQuantcoeff = getQuantCoeff( list, qp, sizeX, sizeY );
  pdErrScale   = xGetErrScaleCoeffSL( list, sizeX, sizeY, qp);

  double dErrScale = (double)( 1 << SCALE_BITS );                                // Compensate for scaling of bitcount in Lagrange cost function

  const bool needsSqrt2 = ((floorLog2(width) + floorLog2(height)) & 1) == 1;
  double dTransShift = (double)iTransformShift + ( needsSqrt2 ? -0.5 : 0.0 );
  dErrScale = dErrScale*pow( 2.0, ( -2.0*dTransShift ) );                     // Compensate for scaling through forward transform

  for( i = 0; i < uiMaxNumCoeff; i++ )
  {
    pdErrScale[i] = dErrScale / piQuantcoeff[i] / piQuantcoeff[i]
                    / (1 << (DISTORTION_PRECISION_ADJUSTMENT(bitDepths.recon[channelType]) << 1));
  }

  int QStep = g_quantScales[needsSqrt2][qp];

  xGetErrScaleCoeffNoScalingList(list, sizeX, sizeY, qp) =
    dErrScale / QStep / QStep / (1 << (DISTORTION_PRECISION_ADJUSTMENT(bitDepths.recon[channelType]) << 1));
}

/** set flat matrix value to quantized coefficient
 */
void QuantRDOQ::setFlatScalingList(const int maxLog2TrDynamicRange[MAX_NUM_CHANNEL_TYPE], const BitDepths &bitDepths)
{
  Quant::setFlatScalingList( maxLog2TrDynamicRange, bitDepths );

  const int minimumQp = 0;
  const int maximumQp = SCALING_LIST_REM_NUM;

  for(uint32_t sizeX = 0; sizeX < SCALING_LIST_SIZE_NUM; sizeX++)
  {
    for(uint32_t sizeY = 0; sizeY < SCALING_LIST_SIZE_NUM; sizeY++)
    {
      for(uint32_t list = 0; list < SCALING_LIST_NUM; list++)
      {
        for(int qp = minimumQp; qp < maximumQp; qp++)
        {
          xSetErrScaleCoeff( list, sizeX, sizeY, qp, maxLog2TrDynamicRange, bitDepths );
        }
      }
    }
  }
}

/** initialization process of scaling list array
 */
void QuantRDOQ::xInitScalingList( const QuantRDOQ* other )
{
  m_isErrScaleListOwner = other == nullptr;

  size_t numElements = 0;

  for (uint32_t sizeIdX = 0; sizeIdX < SCALING_LIST_SIZE_NUM; sizeIdX++)
  {
    for (uint32_t sizeIdY = 0; sizeIdY < SCALING_LIST_SIZE_NUM; sizeIdY++)
    {
      for (uint32_t qp = 0; qp < SCALING_LIST_REM_NUM; qp++)
      {
        for (uint32_t listId = 0; listId < SCALING_LIST_NUM; listId++)
        {
          numElements += g_scalingListSizeX[sizeIdX] * g_scalingListSizeX[sizeIdY];
        }
      }
    }
  }

  if (m_isErrScaleListOwner)
  {
    m_errScale[0][0][0][0] = new double[numElements];
  }

  size_t offset = 0;

  for(uint32_t sizeIdX = 0; sizeIdX < SCALING_LIST_SIZE_NUM; sizeIdX++)
  {
    for(uint32_t sizeIdY = 0; sizeIdY < SCALING_LIST_SIZE_NUM; sizeIdY++)
    {
      for(uint32_t qp = 0; qp < SCALING_LIST_REM_NUM; qp++)
      {
        for(uint32_t listId = 0; listId < SCALING_LIST_NUM; listId++)
        {
          if( m_isErrScaleListOwner )
          {
            m_errScale[sizeIdX][sizeIdY][listId][qp] = m_errScale[0][0][0][0] + offset;
            offset += g_scalingListSizeX[sizeIdX] * g_scalingListSizeX[sizeIdY];
          }
          else
          {
            m_errScale[sizeIdX][sizeIdY][listId][qp] = other->m_errScale[sizeIdX][sizeIdY][listId][qp];
          }
        } // listID loop
      }
    }
  }
}

/** destroy quantization matrix array
 */
void QuantRDOQ::xDestroyScalingList()
{
  if (!m_isErrScaleListOwner)
  {
    return;
  }

  delete[] m_errScale[0][0][0][0];
}

void QuantRDOQ::quant(TransformUnit &tu, const ComponentID &compID, const CCoeffBuf &pSrc, TCoeff &absSum,
                      const QpParam &cQP, const Ctx &ctx)
{
  const CompArea &rect      = tu.blocks[compID];
  const uint32_t uiWidth        = rect.width;
  const uint32_t uiHeight       = rect.height;

  const CCoeffBuf &piCoef   = pSrc;
  CoeffBuf         piQCoef  = tu.getCoeffs(compID);

  const bool useTransformSkip = (tu.mtsIdx[compID] == MtsType::SKIP);

  bool useRDOQ = useTransformSkip ? m_useRDOQTS : m_useRDOQ;

  if (tu.cu->ispMode == ISPType::NONE || !isLuma(compID))
  {
    useRDOQ &= uiWidth > 2;
    useRDOQ &= uiHeight > 2;
  }

  if (tu.cs->slice->isLossless())
  {
    useRDOQ = false;
  }

  if (useRDOQ && (isLuma(compID) || RDOQ_CHROMA))
  {
    if (!m_useSelectiveRDOQ || xNeedRDOQ(tu, compID, piCoef, cQP))
    {
      if( useTransformSkip )
      {
        if (tu.cu->getBdpcmMode(compID) != BdpcmMode::NONE)
        {
          forwardBDPCM(tu, compID, pSrc, absSum, cQP, ctx);
        }
        else
        {
          xRateDistOptQuantTS(tu, compID, pSrc, absSum, cQP, ctx);
        }
      }
      else
      {
        xRateDistOptQuant(tu, compID, pSrc, absSum, cQP, ctx);
      }
    }
    else
    {
      piQCoef.fill(0);
      absSum = 0;
    }
  }
  else
  {
    Quant::quant(tu, compID, pSrc, absSum, cQP, ctx);
  }
}

void QuantRDOQ::xRateDistOptQuant(TransformUnit &tu, const ComponentID &compID, const CCoeffBuf &pSrc, TCoeff &absSum,
                                  const QpParam &cQP, const Ctx &ctx)
{
  const FracBitsAccess& fracBits = ctx.getFracBitsAcess();

  const SPS &sps            = *tu.cs->sps;
  const CompArea &rect      = tu.blocks[compID];
  const uint32_t uiWidth        = rect.width;
  const uint32_t uiHeight       = rect.height;
  const ChannelType chType  = toChannelType(compID);
  const int channelBitDepth = sps.getBitDepth( chType );

  const bool extendedPrecision     = sps.getSpsRangeExtension().getExtendedPrecisionProcessingFlag();
  const int  maxLog2TrDynamicRange = sps.getMaxLog2TrDynamicRange(chType);

  const bool useIntraSubPartitions = tu.cu->ispMode != ISPType::NONE && isLuma(compID);
  /* for 422 chroma blocks, the effective scaling applied during transformation is not a power of 2, hence it cannot be
  * implemented as a bit-shift (the quantised result will be sqrt(2) * larger than required). Alternatively, adjust the
  * uiLog2TrSize applied in iTransformShift, such that the result is 1/sqrt(2) the required result (i.e. smaller)
  * Then a QP+3 (sqrt(2)) or QP-3 (1/sqrt(2)) method could be used to get the required result
  */

  // Represents scaling through forward transform
  int iTransformShift = getTransformShift(channelBitDepth, rect.size(), maxLog2TrDynamicRange);

  if (tu.mtsIdx[compID] == MtsType::SKIP && extendedPrecision)
  {
    iTransformShift = std::max<int>(0, iTransformShift);
  }

  double     d64BlockUncodedCost               = 0;
  const uint32_t uiLog2BlockWidth                  = floorLog2(uiWidth);
  const uint32_t uiLog2BlockHeight                 = floorLog2(uiHeight);
  const uint32_t uiMaxNumCoeff                     = rect.area();

  CHECK(compID >= MAX_NUM_TBLOCKS, "Invalid component ID");

  int scalingListType = getScalingListType(tu.cu->predMode, compID);

  CHECK(scalingListType >= SCALING_LIST_NUM, "Invalid scaling list");

  const TCoeff *plSrcCoeff = pSrc.buf;
        TCoeff *piDstCoeff = tu.getCoeffs(compID).buf;

  double *pdCostCoeff  = m_pdCostCoeff;
  double *pdCostSig    = m_pdCostSig;
  double *pdCostCoeff0 = m_pdCostCoeff0;
  int    *rateIncUp    = m_rateIncUp;
  int    *rateIncDown  = m_rateIncDown;
  int    *sigRateDelta = m_sigRateDelta;
  TCoeff *deltaU       = m_deltaU;

  memset(piDstCoeff, 0, sizeof(*piDstCoeff) * uiMaxNumCoeff);
  memset( m_pdCostCoeff,  0, sizeof( double ) *  uiMaxNumCoeff );
  memset( m_pdCostSig,    0, sizeof( double ) *  uiMaxNumCoeff );
  memset( m_rateIncUp,    0, sizeof( int    ) *  uiMaxNumCoeff );
  memset( m_rateIncDown,  0, sizeof( int    ) *  uiMaxNumCoeff );
  memset( m_sigRateDelta, 0, sizeof( int    ) *  uiMaxNumCoeff );
  memset( m_deltaU,       0, sizeof( TCoeff ) *  uiMaxNumCoeff );


  const bool needSqrtAdjustment= TU::needsBlockSizeTrafoScale( tu, compID );
  const bool isTransformSkip   = tu.mtsIdx[compID] == MtsType::SKIP;
  const double *const pdErrScale = xGetErrScaleCoeffSL(scalingListType, uiLog2BlockWidth, uiLog2BlockHeight, cQP.rem(isTransformSkip));
  const int    *const piQCoef    = getQuantCoeff(scalingListType, cQP.rem(isTransformSkip), uiLog2BlockWidth, uiLog2BlockHeight);

  const bool   disableSMForLFNST = tu.cs->slice->getExplicitScalingListUsed() ? tu.cs->slice->getSPS()->getDisableScalingMatrixForLfnstBlks() : false;
  const bool   isLfnstApplied = tu.cu->lfnstIdx > 0 && (tu.cu->isSepTree() ? true : isLuma(compID));
  const bool   disableSMForACT = tu.cs->slice->getSPS()->getScalingMatrixForAlternativeColourSpaceDisabledFlag() && (tu.cs->slice->getSPS()->getScalingMatrixDesignatedColourSpaceFlag() == tu.cu->colorTransform);
  const bool   enableScalingLists = getUseScalingList(uiWidth, uiHeight, isTransformSkip, isLfnstApplied, disableSMForLFNST, disableSMForACT);
  const int    defaultQuantisationCoefficient = g_quantScales[ needSqrtAdjustment ?1:0][cQP.rem(isTransformSkip)];
  const double defaultErrorScale              = xGetErrScaleCoeffNoScalingList(scalingListType, uiLog2BlockWidth, uiLog2BlockHeight, cQP.rem(isTransformSkip));
  const int iQBits = QUANT_SHIFT + cQP.per(isTransformSkip) + iTransformShift + (needSqrtAdjustment?-1:0);                   // Right shift of non-RDOQ quantizer;  level = (coeff*uiQ + offset)>>q_bits


  const TCoeff entropyCodingMinimum = -(1 << maxLog2TrDynamicRange);
  const TCoeff entropyCodingMaximum =  (1 << maxLog2TrDynamicRange) - 1;

  CoeffCodingContext cctx(tu, compID, tu.cs->slice->getSignDataHidingEnabledFlag(), tu.cu->getBdpcmMode(compID));
  int baseLevel = cctx.getBaseLevel();
  if (tu.cs->slice->getSPS()->getSpsRangeExtension().getPersistentRiceAdaptationEnabledFlag())
  {
    unsigned riceStats = ctx.getGRAdaptStats((unsigned)compID);
    TCoeff historyValue = (TCoeff)1 << riceStats;
    cctx.setHistValue(historyValue);
  }
  else
  {
    cctx.setHistValue(0);
  }
  const int    iCGSizeM1      = (1 << cctx.log2CGSize()) - 1;

  int     iCGLastScanPos      = -1;
  double  d64BaseCost         = 0;
  int     iLastScanPos        = -1;

  int ctxBinSampleRatio = (compID == COMPONENT_Y) ? MAX_TU_LEVEL_CTX_CODED_BIN_CONSTRAINT_LUMA : MAX_TU_LEVEL_CTX_CODED_BIN_CONSTRAINT_CHROMA;
  int remRegBins = (tu.getTbAreaAfterCoefZeroOut(compID) * ctxBinSampleRatio) >> 4;
  uint32_t  goRiceParam   = 0;

  double *pdCostCoeffGroupSig = m_pdCostCoeffGroupSig;
  memset( pdCostCoeffGroupSig, 0, ( uiMaxNumCoeff >> cctx.log2CGSize() ) * sizeof( double ) );
  int iScanPos;
  coeffGroupRDStats rdStats;

#if ENABLE_TRACING
  DTRACE( g_trace_ctx, D_RDOQ, "%d: %3d, %3d, %dx%d, comp=%d\n", DTRACE_GET_COUNTER( g_trace_ctx, D_RDOQ ), rect.x, rect.y, rect.width, rect.height, compID );
#endif

  const uint32_t lfnstIdx = tu.cu->lfnstIdx;

  const int iCGNum = lfnstIdx > 0 ? 1 : getNonzeroTuSize(uiWidth) * getNonzeroTuSize(uiHeight) >> cctx.log2CGSize();

  for (int subSetId = iCGNum - 1; subSetId >= 0; subSetId--)
  {
    cctx.initSubblock( subSetId );

    uint32_t maxNonZeroPosInCG = iCGSizeM1;
    if( lfnstIdx > 0 && ( ( uiWidth == 4 && uiHeight == 4 ) || ( uiWidth == 8 && uiHeight == 8 && cctx.cgPosX() == 0 && cctx.cgPosY() == 0 ) ) )
    {
      maxNonZeroPosInCG = 7;
    }

    memset( &rdStats, 0, sizeof (coeffGroupRDStats));

    for( int iScanPosinCG = iCGSizeM1; iScanPosinCG > maxNonZeroPosInCG; iScanPosinCG-- )
    {
      iScanPos = cctx.minSubPos() + iScanPosinCG;
      uint32_t    blkPos = cctx.blockPos( iScanPos );
      piDstCoeff[ blkPos ] = 0;
    }
    for( int iScanPosinCG = maxNonZeroPosInCG; iScanPosinCG >= 0; iScanPosinCG-- )
    {
      iScanPos = cctx.minSubPos() + iScanPosinCG;
      //===== quantization =====
      uint32_t    uiBlkPos          = cctx.blockPos(iScanPos);

      // set coeff
      const int    quantisationCoefficient = (enableScalingLists) ? piQCoef   [uiBlkPos]               : defaultQuantisationCoefficient;
      const double errorScale              = (enableScalingLists) ? pdErrScale[uiBlkPos]               : defaultErrorScale;
      const int64_t  tmpLevel                = int64_t(abs(plSrcCoeff[ uiBlkPos ])) * quantisationCoefficient;

      const Intermediate_Int lLevelDouble  = (Intermediate_Int)std::min<int64_t>(tmpLevel, std::numeric_limits<Intermediate_Int>::max() - (Intermediate_Int(1) << (iQBits - 1)));

      uint32_t uiMaxAbsLevel        = std::min<uint32_t>(uint32_t(entropyCodingMaximum), uint32_t((lLevelDouble + (Intermediate_Int(1) << (iQBits - 1))) >> iQBits));

      const double dErr         = double( lLevelDouble );
      pdCostCoeff0[ iScanPos ]  = dErr * dErr * errorScale;
      d64BlockUncodedCost      += pdCostCoeff0[ iScanPos ];
      piDstCoeff[ uiBlkPos ]    = uiMaxAbsLevel;

      if ( uiMaxAbsLevel > 0 && iLastScanPos < 0 )
      {
        iLastScanPos            = iScanPos;
        iCGLastScanPos          = cctx.subSetId();
      }

      if ( iLastScanPos >= 0 )
      {

#if ENABLE_TRACING
        uint32_t uiCGPosY = cctx.cgPosX();
        uint32_t uiCGPosX = cctx.cgPosY();
        uint32_t uiPosY = cctx.posY( iScanPos );
        uint32_t uiPosX = cctx.posX( iScanPos );
        DTRACE( g_trace_ctx, D_RDOQ, "%d [%d][%d][%2d:%2d][%2d:%2d]", DTRACE_GET_COUNTER( g_trace_ctx, D_RDOQ ), iScanPos, uiBlkPos, uiCGPosX, uiCGPosY, uiPosX, uiPosY );
#endif
        //===== coefficient level estimation =====
        unsigned ctxIdSig = 0;
        if( iScanPos != iLastScanPos )
        {
          ctxIdSig = cctx.sigCtxIdAbs( iScanPos, piDstCoeff, 0 );
        }
        uint32_t    uiLevel;
        uint8_t ctxOffset     = cctx.ctxOffsetAbs     ();
        uint32_t    uiParCtx      = cctx.parityCtxIdAbs   ( ctxOffset );
        uint32_t    uiGt1Ctx      = cctx.greater1CtxIdAbs ( ctxOffset );
        uint32_t    uiGt2Ctx      = cctx.greater2CtxIdAbs ( ctxOffset );
        uint32_t    goRiceZero    = 0;
        if( remRegBins < 4 )
        {
          goRiceParam = (cctx.*(cctx.deriveRiceRRC))(iScanPos, piDstCoeff, 0);
          goRiceZero       = g_goRicePosCoeff0(0, goRiceParam);
        }

        const BinFracBits fracBitsPar = fracBits.getFracBitsArray( uiParCtx );
        const BinFracBits fracBitsGt1 = fracBits.getFracBitsArray( uiGt1Ctx );
        const BinFracBits fracBitsGt2 = fracBits.getFracBitsArray( uiGt2Ctx );

        if( iScanPos == iLastScanPos )
        {
          uiLevel = xGetCodedLevel( pdCostCoeff[ iScanPos ], pdCostCoeff0[ iScanPos ], pdCostSig[ iScanPos ],
                                    lLevelDouble, uiMaxAbsLevel, nullptr, fracBitsPar, fracBitsGt1, fracBitsGt2, remRegBins, goRiceZero, goRiceParam, iQBits, errorScale, 1, extendedPrecision, maxLog2TrDynamicRange );
        }
        else
        {
          DTRACE_COND( ( uiMaxAbsLevel != 0 ), g_trace_ctx, D_RDOQ_MORE, " uiCtxSig=%d", ctxIdSig );

          const BinFracBits fracBitsSig = fracBits.getFracBitsArray( ctxIdSig );
          uiLevel = xGetCodedLevel( pdCostCoeff[ iScanPos ], pdCostCoeff0[ iScanPos ], pdCostSig[ iScanPos ],
                                    lLevelDouble, uiMaxAbsLevel, &fracBitsSig, fracBitsPar, fracBitsGt1, fracBitsGt2, remRegBins, goRiceZero, goRiceParam, iQBits, errorScale, 0, extendedPrecision, maxLog2TrDynamicRange );
          sigRateDelta[ uiBlkPos ] = ( remRegBins < 4 ? 0 : fracBitsSig.intBits[1] - fracBitsSig.intBits[0] );
        }

        DTRACE( g_trace_ctx, D_RDOQ, " Lev=%d \n", uiLevel );
        DTRACE_COND( ( uiMaxAbsLevel != 0 ), g_trace_ctx, D_RDOQ, " CostC0=%d\n", (int64_t)( pdCostCoeff0[iScanPos] ) );
        DTRACE_COND( ( uiMaxAbsLevel != 0 ), g_trace_ctx, D_RDOQ, " CostC =%d\n", (int64_t)( pdCostCoeff[iScanPos] ) );

        deltaU[ uiBlkPos ]        = TCoeff((lLevelDouble - (Intermediate_Int(uiLevel) << iQBits)) >> (iQBits-8));

        if( uiLevel > 0 )
        {
          int rateNow              = xGetICRate( uiLevel,   fracBitsPar, fracBitsGt1, fracBitsGt2, remRegBins, goRiceZero, goRiceParam, extendedPrecision, maxLog2TrDynamicRange );
          rateIncUp   [ uiBlkPos ] = xGetICRate( uiLevel+1, fracBitsPar, fracBitsGt1, fracBitsGt2, remRegBins, goRiceZero, goRiceParam, extendedPrecision, maxLog2TrDynamicRange ) - rateNow;
          rateIncDown [ uiBlkPos ] = xGetICRate( uiLevel-1, fracBitsPar, fracBitsGt1, fracBitsGt2, remRegBins, goRiceZero, goRiceParam, extendedPrecision, maxLog2TrDynamicRange ) - rateNow;
        }
        else // uiLevel == 0
        {
          if( remRegBins < 4 )
          {
            int rateNow            = xGetICRate( uiLevel,   fracBitsPar, fracBitsGt1, fracBitsGt2, remRegBins, goRiceZero, goRiceParam, extendedPrecision, maxLog2TrDynamicRange );
            rateIncUp [ uiBlkPos ] = xGetICRate( uiLevel+1, fracBitsPar, fracBitsGt1, fracBitsGt2, remRegBins, goRiceZero, goRiceParam, extendedPrecision, maxLog2TrDynamicRange ) - rateNow;
          }
          else
          {
            rateIncUp [ uiBlkPos ] = fracBitsGt1.intBits[ 0 ];
          }
        }
        piDstCoeff[ uiBlkPos ] = uiLevel;
        d64BaseCost           += pdCostCoeff [ iScanPos ];

        if( ( (iScanPos & iCGSizeM1) == 0 ) && ( iScanPos > 0 ) )
        {
          goRiceParam   = 0;
        }
        else if( remRegBins >= 4 )
        {
          goRiceParam = (cctx.*(cctx.deriveRiceRRC))(iScanPos, piDstCoeff, baseLevel);
          remRegBins -= (uiLevel < 2 ? uiLevel : 3) + (iScanPos != iLastScanPos);
        }
      }
      else
      {
        d64BaseCost    += pdCostCoeff0[ iScanPos ];
      }
      rdStats.d64SigCost += pdCostSig[ iScanPos ];
      if (iScanPosinCG == 0 )
      {
        rdStats.d64SigCost_0 = pdCostSig[ iScanPos ];
      }
      if (piDstCoeff[ uiBlkPos ] )
      {
        cctx.setSigGroup();
        rdStats.d64CodedLevelandDist += pdCostCoeff[ iScanPos ] - pdCostSig[ iScanPos ];
        rdStats.d64UncodedDist += pdCostCoeff0[ iScanPos ];
        if ( iScanPosinCG != 0 )
        {
          rdStats.iNNZbeforePos0++;
        }
      }
    } //end for (iScanPosinCG)

    if (iCGLastScanPos >= 0)
    {
      if( cctx.subSetId() )
      {
        if( !cctx.isSigGroup() )
        {
          const BinFracBits fracBitsSigGroup = fracBits.getFracBitsArray( cctx.sigGroupCtxId() );
          d64BaseCost += xGetRateSigCoeffGroup(fracBitsSigGroup, 0) - rdStats.d64SigCost;
          pdCostCoeffGroupSig[ cctx.subSetId() ] = xGetRateSigCoeffGroup(fracBitsSigGroup, 0);
        }
        else
        {
          if (cctx.subSetId() < iCGLastScanPos) //skip the last coefficient group, which will be handled together with last position below.
          {
            if ( rdStats.iNNZbeforePos0 == 0 )
            {
              d64BaseCost -= rdStats.d64SigCost_0;
              rdStats.d64SigCost -= rdStats.d64SigCost_0;
            }
            // rd-cost if SigCoeffGroupFlag = 0, initialization
            double d64CostZeroCG = d64BaseCost;

            const BinFracBits fracBitsSigGroup = fracBits.getFracBitsArray( cctx.sigGroupCtxId() );

            if (cctx.subSetId() < iCGLastScanPos)
            {
              d64BaseCost  += xGetRateSigCoeffGroup(fracBitsSigGroup,1);
              d64CostZeroCG += xGetRateSigCoeffGroup(fracBitsSigGroup,0);
              pdCostCoeffGroupSig[ cctx.subSetId() ] = xGetRateSigCoeffGroup(fracBitsSigGroup,1);
            }

            // try to convert the current coeff group from non-zero to all-zero
            d64CostZeroCG += rdStats.d64UncodedDist;  // distortion for resetting non-zero levels to zero levels
            d64CostZeroCG -= rdStats.d64CodedLevelandDist;   // distortion and level cost for keeping all non-zero levels
            d64CostZeroCG -= rdStats.d64SigCost;     // sig cost for all coeffs, including zero levels and non-zerl levels

                                                     // if we can save cost, change this block to all-zero block
            if ( d64CostZeroCG < d64BaseCost )
            {
              cctx.resetSigGroup();
              d64BaseCost = d64CostZeroCG;
              if (cctx.subSetId() < iCGLastScanPos)
              {
                pdCostCoeffGroupSig[ cctx.subSetId() ] = xGetRateSigCoeffGroup(fracBitsSigGroup,0);
              }
              // reset coeffs to 0 in this block
              for( int iScanPosinCG = maxNonZeroPosInCG; iScanPosinCG >= 0; iScanPosinCG-- )
              {
                iScanPos      = cctx.minSubPos() + iScanPosinCG;
                uint32_t uiBlkPos = cctx.blockPos( iScanPos );

                if (piDstCoeff[ uiBlkPos ])
                {
                  piDstCoeff [ uiBlkPos ] = 0;
                  pdCostCoeff[ iScanPos ] = pdCostCoeff0[ iScanPos ];
                  pdCostSig  [ iScanPos ] = 0;
                }
              }
            } // end if ( d64CostAllZeros < d64BaseCost )
          }
        } // end if if (uiSigCoeffGroupFlag[ uiCGBlkPos ] == 0)
      }
      else
      {
        cctx.setSigGroup();
      }
    }
  } //end for (cctx.subSetId)


  //===== estimate last position =====
  if ( iLastScanPos < 0 )
  {
    return;
  }

  double  d64BestCost         = 0;
  int     iBestLastIdxP1      = 0;


  if( !CU::isIntra( *tu.cu ) && isLuma( compID ) && tu.depth == 0 )
  {
    const BinFracBits fracBitsQtRootCbf = fracBits.getFracBitsArray( Ctx::QtRootCbf() );
    d64BestCost  = d64BlockUncodedCost + xGetICost( fracBitsQtRootCbf.intBits[ 0 ] );
    d64BaseCost += xGetICost( fracBitsQtRootCbf.intBits[ 1 ] );
  }
  else
  {
    bool previousCbf       = tu.cbf[COMPONENT_Cb];
    bool lastCbfIsInferred = false;
    if( useIntraSubPartitions )
    {
      bool rootCbfSoFar       = false;
      bool isLastSubPartition = CU::isISPLast(*tu.cu, tu.Y(), compID);
      uint32_t nTus               = tu.cu->ispMode == ISPType::HOR ? tu.cu->lheight() >> floorLog2(tu.lheight())
                                                                   : tu.cu->lwidth() >> floorLog2(tu.lwidth());
      if( isLastSubPartition )
      {
        TransformUnit* tuPointer = tu.cu->firstTU;
        for( int tuIdx = 0; tuIdx < nTus - 1; tuIdx++ )
        {
          rootCbfSoFar |= TU::getCbfAtDepth(*tuPointer, COMPONENT_Y, tu.depth);
          tuPointer     = tuPointer->next;
        }
        if( !rootCbfSoFar )
        {
          lastCbfIsInferred = true;
        }
      }
      if( !lastCbfIsInferred )
      {
        previousCbf = TU::getPrevTuCbfAtDepth(tu, compID, tu.depth);
      }
    }
    BinFracBits fracBitsQtCbf = fracBits.getFracBitsArray( Ctx::QtCbf[compID]( DeriveCtx::CtxQtCbf( rect.compID, previousCbf, useIntraSubPartitions ) ) );

    if( !lastCbfIsInferred )
    {
      d64BestCost  = d64BlockUncodedCost + xGetICost(fracBitsQtCbf.intBits[0]);
      d64BaseCost += xGetICost(fracBitsQtCbf.intBits[1]);
    }
    else
    {
      d64BestCost  = d64BlockUncodedCost;
    }
  }

  int lastBitsX[LAST_SIGNIFICANT_GROUPS] = { 0 };
  int lastBitsY[LAST_SIGNIFICANT_GROUPS] = { 0 };
  {
    int dim1  = getNonzeroTuSize(uiWidth);
    int dim2  = getNonzeroTuSize(uiHeight);
    int bitsX = 0;
    int bitsY = 0;
    int ctxId;
    //X-coordinate
    for (ctxId = 0; ctxId < g_groupIdx[dim1 - 1]; ctxId++)
    {
      const BinFracBits fB = fracBits.getFracBitsArray( cctx.lastXCtxId(ctxId) );
      lastBitsX[ ctxId ]   = bitsX + fB.intBits[ 0 ];
      bitsX               +=         fB.intBits[ 1 ];
    }
    lastBitsX[ctxId] = bitsX;
    //Y-coordinate
    for (ctxId = 0; ctxId < g_groupIdx[dim2 - 1]; ctxId++)
    {
      const BinFracBits fB = fracBits.getFracBitsArray( cctx.lastYCtxId(ctxId) );
      lastBitsY[ ctxId ]   = bitsY + fB.intBits[ 0 ];
      bitsY               +=         fB.intBits[ 1 ];
    }
    lastBitsY[ctxId] = bitsY;
  }

  unsigned zoTbWdith  = getNonzeroTuSize(cctx.width());
  unsigned zoTbHeight = getNonzeroTuSize(cctx.height());
  if (tu.cs->sps->getMtsEnabled() && tu.cu->sbtInfo != 0 && tu.blocks[compID].width <= 32
      && tu.blocks[compID].height <= 32 && compID == COMPONENT_Y)
  {
    zoTbWdith  = (tu.blocks[compID].width == 32) ? 16 : zoTbWdith;
    zoTbHeight = (tu.blocks[compID].height == 32) ? 16 : zoTbHeight;
  }

  bool bFoundLast = false;
  for (int iCGScanPos = iCGLastScanPos; iCGScanPos >= 0; iCGScanPos--)
  {
    d64BaseCost -= pdCostCoeffGroupSig [ iCGScanPos ];
    if (cctx.isSigGroup( iCGScanPos ) )
    {
      uint32_t maxNonZeroPosInCG = iCGSizeM1;
      if( lfnstIdx > 0 && ( ( uiWidth == 4 && uiHeight == 4 ) || ( uiWidth == 8 && uiHeight == 8 && cctx.cgPosX() == 0 && cctx.cgPosY() == 0 ) ) )
      {
        maxNonZeroPosInCG = 7;
      }
      for( int iScanPosinCG = maxNonZeroPosInCG; iScanPosinCG >= 0; iScanPosinCG-- )
      {
        iScanPos = iCGScanPos * (iCGSizeM1 + 1) + iScanPosinCG;

        if (iScanPos > iLastScanPos)
        {
          continue;
        }
        uint32_t   uiBlkPos     = cctx.blockPos( iScanPos );

        if( piDstCoeff[ uiBlkPos ] )
        {
          uint32_t   uiPosY = uiBlkPos >> uiLog2BlockWidth;
          uint32_t   uiPosX = uiBlkPos - ( uiPosY << uiLog2BlockWidth );
          if (tu.cu->slice->getReverseLastSigCoeffFlag())
          {
            uiPosX = zoTbWdith - 1 - uiPosX;
            uiPosY = zoTbHeight - 1 - uiPosY;
          }
          double d64CostLast  = xGetRateLast( lastBitsX, lastBitsY, uiPosX, uiPosY );

          double totalCost = d64BaseCost + d64CostLast - pdCostSig[ iScanPos ];

          if( totalCost < d64BestCost )
          {
            iBestLastIdxP1  = iScanPos + 1;
            d64BestCost     = totalCost;
          }
          if( piDstCoeff[ uiBlkPos ] > 1 )
          {
            bFoundLast = true;
            break;
          }
          d64BaseCost      -= pdCostCoeff[ iScanPos ];
          d64BaseCost      += pdCostCoeff0[ iScanPos ];
        }
        else
        {
          d64BaseCost      -= pdCostSig[ iScanPos ];
        }
      } //end for
      if (bFoundLast)
      {
        break;
      }
    } // end if (uiSigCoeffGroupFlag[ uiCGBlkPos ])
    DTRACE( g_trace_ctx, D_RDOQ_COST, "%d: %3d, %3d, %dx%d, comp=%d\n", DTRACE_GET_COUNTER( g_trace_ctx, D_RDOQ_COST ), rect.x, rect.y, rect.width, rect.height, compID );
    DTRACE( g_trace_ctx, D_RDOQ_COST, "Uncoded=%d\n", (int64_t)( d64BlockUncodedCost ) );
    DTRACE( g_trace_ctx, D_RDOQ_COST, "Coded  =%d\n", (int64_t)( d64BaseCost ) );

  } // end for


  for ( int scanPos = 0; scanPos < iBestLastIdxP1; scanPos++ )
  {
    int blkPos = cctx.blockPos( scanPos );
    TCoeff level = piDstCoeff[ blkPos ];
    absSum += level;
    piDstCoeff[ blkPos ] = ( plSrcCoeff[ blkPos ] < 0 ) ? -level : level;
  }

  //===== clean uncoded coefficients =====
  for ( int scanPos = iBestLastIdxP1; scanPos <= iLastScanPos; scanPos++ )
  {
    piDstCoeff[ cctx.blockPos( scanPos ) ] = 0;
  }

  if (cctx.signHiding() && absSum >= 2)
  {
    const double inverseQuantScale = double(g_invQuantScales[0][cQP.rem(isTransformSkip)]);
    int64_t rdFactor = (int64_t)(inverseQuantScale * inverseQuantScale * (1 << (2 * cQP.per(isTransformSkip))) / m_dLambda / 16
                                  / (1 << (2 * DISTORTION_PRECISION_ADJUSTMENT(channelBitDepth)))
                             + 0.5);

    int lastCG = -1;
    int absSum = 0 ;
    int n ;
    for (int subSet = iCGNum - 1; subSet >= 0; subSet--)
    {
      int  subPos         = subSet << cctx.log2CGSize();
      int  firstNZPosInCG = iCGSizeM1 + 1, lastNZPosInCG = -1;
      absSum = 0 ;

      for( n = iCGSizeM1; n >= 0; --n )
      {
        if( piDstCoeff[ cctx.blockPos( n + subPos )] )
        {
          lastNZPosInCG = n;
          break;
        }
      }

      for( n = 0; n <= iCGSizeM1; n++ )
      {
        if( piDstCoeff[ cctx.blockPos( n + subPos )] )
        {
          firstNZPosInCG = n;
          break;
        }
      }

      for( n = firstNZPosInCG; n <= lastNZPosInCG; n++ )
      {
        absSum += int(piDstCoeff[ cctx.blockPos( n + subPos )]);
      }

      if(lastNZPosInCG>=0 && lastCG==-1)
      {
        lastCG = 1;
      }

      if( lastNZPosInCG-firstNZPosInCG>=SBH_THRESHOLD )
      {
        uint32_t signbit = (piDstCoeff[cctx.blockPos(subPos+firstNZPosInCG)]>0?0:1);
        if( signbit!=(absSum&0x1) )  // hide but need tune
        {
          // calculate the cost
          int64_t minCostInc = std::numeric_limits<int64_t>::max(), curCost = std::numeric_limits<int64_t>::max();
          int minPos = -1, finalChange = 0, curChange = 0;

          for( n = (lastCG == 1 ? lastNZPosInCG : iCGSizeM1); n >= 0; --n )
          {
            uint32_t uiBlkPos   = cctx.blockPos( n + subPos );
            if(piDstCoeff[ uiBlkPos ] != 0 )
            {
              int64_t costUp   = rdFactor * ( - deltaU[uiBlkPos] ) + rateIncUp[uiBlkPos];
              int64_t costDown = rdFactor * (   deltaU[uiBlkPos] ) + rateIncDown[uiBlkPos]
                -   ((abs(piDstCoeff[uiBlkPos]) == 1) ? sigRateDelta[uiBlkPos] : 0);

              if(lastCG==1 && lastNZPosInCG==n && abs(piDstCoeff[uiBlkPos])==1)
              {
                costDown -= (4<<SCALE_BITS);
              }

              if(costUp<costDown)
              {
                curCost = costUp;
                curChange =  1;
              }
              else
              {
                curChange = -1;
                if(n==firstNZPosInCG && abs(piDstCoeff[uiBlkPos])==1)
                {
                  curCost = std::numeric_limits<int64_t>::max();
                }
                else
                {
                  curCost = costDown;
                }
              }
            }
            else
            {
              curCost = rdFactor * ( - (abs(deltaU[uiBlkPos])) ) + (1<<SCALE_BITS) + rateIncUp[uiBlkPos] + sigRateDelta[uiBlkPos] ;
              curChange = 1 ;

              if(n<firstNZPosInCG)
              {
                uint32_t thissignbit = (plSrcCoeff[uiBlkPos]>=0?0:1);
                if(thissignbit != signbit )
                {
                  curCost = std::numeric_limits<int64_t>::max();
                }
              }
            }

            if( curCost<minCostInc)
            {
              minCostInc = curCost;
              finalChange = curChange;
              minPos = uiBlkPos;
            }
          }

          if(piDstCoeff[minPos] == entropyCodingMaximum || piDstCoeff[minPos] == entropyCodingMinimum)
          {
            finalChange = -1;
          }

          if(plSrcCoeff[minPos]>=0)
          {
            piDstCoeff[minPos] += finalChange ;
          }
          else
          {
            piDstCoeff[minPos] -= finalChange ;
          }
        }
      }

      if(lastCG==1)
      {
        lastCG=0 ;
      }
    }
  }
}

void QuantRDOQ::xRateDistOptQuantTS( TransformUnit &tu, const ComponentID &compID, const CCoeffBuf &coeffs, TCoeff &absSum, const QpParam &qp, const Ctx &ctx )
{
  const FracBitsAccess& fracBits = ctx.getFracBitsAcess();

  const SPS &sps            = *tu.cs->sps;
  const CompArea &rect      = tu.blocks[compID];
  const uint32_t width      = rect.width;
  const uint32_t height     = rect.height;
  const ChannelType chType  = toChannelType(compID);
  const int channelBitDepth = sps.getBitDepth( chType );

  const bool extendedPrecision     = sps.getSpsRangeExtension().getExtendedPrecisionProcessingFlag();
  const int  maxLog2TrDynamicRange = sps.getMaxLog2TrDynamicRange(chType);

  int transformShift = getTransformShift( channelBitDepth, rect.size(), maxLog2TrDynamicRange );

  if( extendedPrecision )
  {
    transformShift = std::max<int>( 0, transformShift );
  }

  const uint32_t maxNumCoeff                        = rect.area();

  CHECK( compID >= MAX_NUM_TBLOCKS, "Invalid component ID" );

  int scalingListType = getScalingListType( tu.cu->predMode, compID );
  CHECK( scalingListType >= SCALING_LIST_NUM, "Invalid scaling list" );

  const TCoeff *srcCoeff = coeffs.buf;
        TCoeff *dstCoeff = tu.getCoeffs( compID ).buf;

  double *costCoeff  = m_pdCostCoeff;
  double *costSig    = m_pdCostSig;
  double *costCoeff0 = m_pdCostCoeff0;

  memset( m_pdCostCoeff,  0, sizeof( double ) *  maxNumCoeff );
  memset( m_pdCostSig,    0, sizeof( double ) *  maxNumCoeff );

  m_bdpcm = BdpcmMode::NONE;

  const bool   needsSqrt2Scale = TU::needsSqrt2Scale( tu, compID );  // should always be false - transform-skipped blocks don't require sqrt(2) compensation.
  const bool   isTransformSkip         = tu.mtsIdx[compID] == MtsType::SKIP;
  const int    qBits = QUANT_SHIFT + qp.per(isTransformSkip) + (isTransformSkip ? 0 : transformShift) + (needsSqrt2Scale ? -1 : 0);  // Right shift of non-RDOQ quantizer;  level = (coeff*uiQ + offset)>>q_bits
  const int    quantisationCoefficient = g_quantScales[needsSqrt2Scale?1:0][qp.rem(isTransformSkip)];
  const double errorScale              = xGetErrScaleCoeff( TU::needsSqrt2Scale(tu, compID), width, height, qp.rem(isTransformSkip), maxLog2TrDynamicRange, channelBitDepth, isTransformSkip);

  const TCoeff entropyCodingMaximum = ( 1 << maxLog2TrDynamicRange ) - 1;

  uint32_t coeffLevels[3];
  double   coeffLevelError[4];

  CoeffCodingContext cctx(tu, compID, tu.cs->slice->getSignDataHidingEnabledFlag(), m_bdpcm);
  const int sbSizeM1    = ( 1 << cctx.log2CGSize() ) - 1;
  double    baseCost    = 0;
  uint32_t  goRiceParam = 0;

  double *costSigSubBlock = m_pdCostCoeffGroupSig;
  memset( costSigSubBlock, 0, ( maxNumCoeff >> cctx.log2CGSize() ) * sizeof( double ) );

  const int sbNum = width * height >> cctx.log2CGSize();
  int scanPos;
  coeffGroupRDStats rdStats;

  bool anySigCG = false;

  int maxCtxBins = (cctx.maxNumCoeff() * 7) >> 2;
  cctx.setNumCtxBins(maxCtxBins);

  for( int sbId = 0; sbId < sbNum; sbId++ )
  {
    cctx.initSubblock( sbId );

    int noCoeffCoded = 0;
    baseCost = 0.0;
    memset( &rdStats, 0, sizeof (coeffGroupRDStats));

    rdStats.iNumSbbCtxBins = 0;

    for( int scanPosInSB = 0; scanPosInSB <= sbSizeM1; scanPosInSB++ )
    {
      int lastPosCoded = sbSizeM1;
      scanPos = cctx.minSubPos() + scanPosInSB;
      //===== quantization =====
      uint32_t blkPos = cctx.blockPos( scanPos );

      // set coeff
      const int64_t          tmpLevel    = int64_t( abs( srcCoeff[blkPos] ) ) * quantisationCoefficient;
      const Intermediate_Int levelDouble = (Intermediate_Int)std::min<int64_t>( tmpLevel, std::numeric_limits<Intermediate_Int>::max() - ( Intermediate_Int( 1 ) << ( qBits - 1 ) ) );

      uint32_t roundAbsLevel = std::min<uint32_t>(uint32_t(entropyCodingMaximum), uint32_t((levelDouble + (Intermediate_Int(1) << (qBits - 1))) >> qBits));
      uint32_t minAbsLevel = (roundAbsLevel > 1 ? roundAbsLevel - 1 : 1);

      uint32_t downAbsLevel = std::min<uint32_t>(uint32_t(entropyCodingMaximum), uint32_t(levelDouble >> qBits));
      uint32_t upAbsLevel = std::min<uint32_t>(uint32_t(entropyCodingMaximum), downAbsLevel + 1);

      m_testedLevels = 0;
      coeffLevels[m_testedLevels++] = roundAbsLevel;

      if (minAbsLevel != roundAbsLevel)
      {
        coeffLevels[m_testedLevels++] = minAbsLevel;
      }

      int rightPixel, belowPixel, predPixel;

      cctx.neighTS(rightPixel, belowPixel, scanPos, dstCoeff);
      predPixel = cctx.deriveModCoeff(rightPixel, belowPixel, upAbsLevel, false);

      if (upAbsLevel != roundAbsLevel && upAbsLevel != minAbsLevel && predPixel == 1)
      {
        coeffLevels[m_testedLevels++] = upAbsLevel;
      }

      double dErr = double(levelDouble);
      coeffLevelError[0] = dErr * dErr * errorScale;

      costCoeff0[scanPos] = coeffLevelError[0];
      dstCoeff[blkPos]    = coeffLevels[0];

      //===== coefficient level estimation =====
            unsigned    ctxIdSig = cctx.sigCtxIdAbsTS( scanPos, dstCoeff );
            uint32_t    cLevel;
      const BinFracBits fracBitsPar = fracBits.getFracBitsArray( cctx.parityCtxIdAbsTS() );

      goRiceParam = cctx.templateAbsSumTS( scanPos, dstCoeff );
      if (tu.cu->slice->getSPS()->getSpsRangeExtension().getTSRCRicePresentFlag() && tu.mtsIdx[compID] == MtsType::SKIP)
      {
        goRiceParam = goRiceParam + tu.cu->slice->getTsrcIndex();
      }
      unsigned          ctxIdSign    = cctx.signCtxIdAbsTS(scanPos, dstCoeff, m_bdpcm);
      const BinFracBits fracBitsSign = fracBits.getFracBitsArray(ctxIdSign);
      const uint8_t     sign         = srcCoeff[ blkPos ] < 0 ? 1 : 0;

      DTRACE_COND( ( coeffLevels[0] != 0 ), g_trace_ctx, D_RDOQ_MORE, " uiCtxSig=%d", ctxIdSig );

      unsigned          gt1CtxId    = cctx.lrg1CtxIdAbsTS(scanPos, dstCoeff, m_bdpcm);
      const BinFracBits fracBitsGr1 = fracBits.getFracBitsArray(gt1CtxId);

      const BinFracBits fracBitsSig = fracBits.getFracBitsArray( ctxIdSig );
      bool lastCoeff = false; //
      if (scanPosInSB == lastPosCoded && noCoeffCoded == 0)
      {
        lastCoeff = true;
      }
      int numUsedCtxBins = 0;
      cLevel = xGetCodedLevelTSPred(costCoeff[scanPos], costCoeff0[scanPos], costSig[scanPos], levelDouble, qBits, errorScale, coeffLevels, coeffLevelError,
                                    &fracBitsSig, fracBitsPar, cctx, fracBits, fracBitsSign, fracBitsGr1, sign, rightPixel, belowPixel, goRiceParam, lastCoeff, extendedPrecision, maxLog2TrDynamicRange, numUsedCtxBins);

      cctx.decimateNumCtxBins(numUsedCtxBins);
      rdStats.iNumSbbCtxBins += numUsedCtxBins;


      if (cLevel > 0)
      {
        noCoeffCoded++;
      }

      TCoeff level = cLevel;
      dstCoeff[blkPos] = (level != 0 && srcCoeff[blkPos] < 0) ? -level : level;
      baseCost           += costCoeff[ scanPos ];
      rdStats.d64SigCost += costSig[ scanPos ];

      if( dstCoeff[ blkPos ] )
      {
        cctx.setSigGroup();
        rdStats.d64CodedLevelandDist += costCoeff [ scanPos ] - costSig[ scanPos ];
        rdStats.d64UncodedDist       += costCoeff0[ scanPos ];
      }
    } //end for (iScanPosinCG)

    if( !cctx.isSigGroup() )
    {
      const BinFracBits fracBitsSigGroup = fracBits.getFracBitsArray( cctx.sigGroupCtxId( true ) );
      baseCost += xGetRateSigCoeffGroup( fracBitsSigGroup, 0 ) - rdStats.d64SigCost;
      costSigSubBlock[cctx.subSetId()] = xGetRateSigCoeffGroup( fracBitsSigGroup, 0 );
      cctx.increaseNumCtxBins(rdStats.iNumSbbCtxBins); // skip sub-block
    }
    else if( sbId != sbNum - 1 || anySigCG )
    {
      // rd-cost if SigCoeffGroupFlag = 0, initialization
      double costZeroSB = baseCost;

      const BinFracBits fracBitsSigGroup = fracBits.getFracBitsArray( cctx.sigGroupCtxId( true ) );

      baseCost   += xGetRateSigCoeffGroup( fracBitsSigGroup, 1 );
      costZeroSB += xGetRateSigCoeffGroup( fracBitsSigGroup, 0 );
      costSigSubBlock[ cctx.subSetId() ] = xGetRateSigCoeffGroup( fracBitsSigGroup, 1 );

      costZeroSB += rdStats.d64UncodedDist;         // distortion for resetting non-zero levels to zero levels
      costZeroSB -= rdStats.d64CodedLevelandDist;   // distortion and level cost for keeping all non-zero levels
      costZeroSB -= rdStats.d64SigCost;             // sig cost for all coeffs, including zero levels and non-zerl levels

      if( costZeroSB < baseCost )
      {
        cctx.resetSigGroup();
        baseCost = costZeroSB;
        costSigSubBlock[ cctx.subSetId() ] = xGetRateSigCoeffGroup( fracBitsSigGroup, 0 );
        cctx.increaseNumCtxBins(rdStats.iNumSbbCtxBins); // skip sub-block
        for( int scanPosInSB = 0; scanPosInSB <= sbSizeM1; scanPosInSB++ )
        {
          scanPos = cctx.minSubPos() + scanPosInSB;
          uint32_t blkPos = cctx.blockPos( scanPos );

          if( dstCoeff[ blkPos ] )
          {
            dstCoeff[ blkPos ] = 0;
            costCoeff[ scanPos ] = costCoeff0[ scanPos ];
            costSig[ scanPos] = 0;
          }
        }
      }
      else
      {
        anySigCG = true;
      }
    }
  }

  //===== estimate last position =====
  for( int scanPos = 0; scanPos < maxNumCoeff; scanPos++ )
  {
    int blkPos = cctx.blockPos( scanPos );
    TCoeff level = dstCoeff[ blkPos ];
    absSum += abs(level);
  }
}

void QuantRDOQ::forwardBDPCM(TransformUnit &tu, const ComponentID &compID, const CCoeffBuf &coeffs, TCoeff &absSum,
                             const QpParam &qp, const Ctx &ctx)
{
  const FracBitsAccess& fracBits = ctx.getFracBitsAcess();

  const SPS &sps = *tu.cs->sps;
  const CompArea &rect = tu.blocks[compID];
  const uint32_t width = rect.width;
  const uint32_t height = rect.height;
  const ChannelType chType = toChannelType(compID);
  const int channelBitDepth = sps.getBitDepth(chType);

  const bool extendedPrecision = sps.getSpsRangeExtension().getExtendedPrecisionProcessingFlag();
  const int  maxLog2TrDynamicRange = sps.getMaxLog2TrDynamicRange(chType);
  const BdpcmMode dirMode               = tu.cu->getBdpcmMode(compID);
  int transformShift = getTransformShift(channelBitDepth, rect.size(), maxLog2TrDynamicRange);

  if (extendedPrecision)
  {
    transformShift = std::max<int>(0, transformShift);
  }

  const uint32_t maxNumCoeff = rect.area();

  CHECK(compID >= MAX_NUM_TBLOCKS, "Invalid component ID");

  int scalingListType = getScalingListType(tu.cu->predMode, compID);
  CHECK(scalingListType >= SCALING_LIST_NUM, "Invalid scaling list");

  const TCoeff *srcCoeff = coeffs.buf;
  TCoeff *dstCoeff = tu.getCoeffs(compID).buf;

  double *costCoeff = m_pdCostCoeff;
  double *costSig = m_pdCostSig;
  double *costCoeff0 = m_pdCostCoeff0;

  memset(m_pdCostCoeff, 0, sizeof(double) *  maxNumCoeff);
  memset(m_pdCostSig, 0, sizeof(double) *  maxNumCoeff);
  memset(m_fullCoeff, 0, sizeof(TCoeff) * maxNumCoeff);

  m_bdpcm = dirMode;

  const bool needsSqrt2Scale = TU::needsSqrt2Scale(
    tu, compID);   // should always be false - transform-skipped blocks don't require sqrt(2) compensation.
  const bool isTransformSkip = tu.mtsIdx[compID] == MtsType::SKIP;

  const int    qBits = QUANT_SHIFT + qp.per(isTransformSkip) + (isTransformSkip? 0 : transformShift) + ( needsSqrt2Scale ? -1 : 0);  // Right shift of non-RDOQ quantizer;  level = (coeff*uiQ + offset)>>q_bits
  const int    quantisationCoefficient = g_quantScales[needsSqrt2Scale ? 1 : 0][qp.rem(isTransformSkip)];
  const double errorScale = xGetErrScaleCoeff(TU::needsSqrt2Scale(tu, compID), width, height, qp.rem(isTransformSkip), maxLog2TrDynamicRange, channelBitDepth, isTransformSkip);
  TrQuantParams trQuantParams;
  trQuantParams.rightShift = (IQUANT_SHIFT - ((isTransformSkip ? 0 : transformShift) + qp.per(isTransformSkip)));
  trQuantParams.qScale = g_invQuantScales[needsSqrt2Scale ? 1 : 0][qp.rem(isTransformSkip)];

  const TCoeff entropyCodingMaximum = (1 << maxLog2TrDynamicRange) - 1;

  uint32_t coeffLevels[3];
  double   coeffLevelError[4];

  CoeffCodingContext cctx(tu, compID, tu.cs->slice->getSignDataHidingEnabledFlag(), dirMode);
  const int sbSizeM1 = (1 << cctx.log2CGSize()) - 1;
  double    baseCost = 0;
  uint32_t  goRiceParam = 0;

  double *costSigSubBlock = m_pdCostCoeffGroupSig;
  memset(costSigSubBlock, 0, (maxNumCoeff >> cctx.log2CGSize()) * sizeof(double));

  const int sbNum = width * height >> cctx.log2CGSize();
  int scanPos;
  coeffGroupRDStats rdStats;

  bool anySigCG = false;

  int maxCtxBins = (cctx.maxNumCoeff() * 7) >> 2;
  cctx.setNumCtxBins(maxCtxBins);

  for (int sbId = 0; sbId < sbNum; sbId++)
  {
    cctx.initSubblock(sbId);

    int noCoeffCoded = 0;
    baseCost = 0.0;
    memset(&rdStats, 0, sizeof(coeffGroupRDStats));
    rdStats.iNumSbbCtxBins = 0;

    for (int scanPosInSB = 0; scanPosInSB <= sbSizeM1; scanPosInSB++)
    {
      int lastPosCoded = sbSizeM1;
      scanPos = cctx.minSubPos() + scanPosInSB;
      //===== quantization =====
      uint32_t blkPos = cctx.blockPos(scanPos);

      const int posX = cctx.posX(scanPos);
      const int posY = cctx.posY(scanPos);
      const int posS = (BdpcmMode::HOR == dirMode) ? posX : posY;
      const ptrdiff_t posNb =
        (BdpcmMode::HOR == dirMode) ? (posX - 1) + posY * coeffs.stride : posX + (posY - 1) * coeffs.stride;
      TCoeff predCoeff = (0 != posS) ? m_fullCoeff[posNb] : 0;

      // set coeff
      const int64_t          tmpLevel = int64_t(abs(srcCoeff[blkPos] - predCoeff)) * quantisationCoefficient;
      const Intermediate_Int levelDouble = (Intermediate_Int)std::min<int64_t>(tmpLevel, std::numeric_limits<Intermediate_Int>::max() - (Intermediate_Int(1) << (qBits - 1)));
      uint32_t roundAbsLevel = std::min<uint32_t>(uint32_t(entropyCodingMaximum), uint32_t((levelDouble + (Intermediate_Int(1) << (qBits - 1))) >> qBits));
      uint32_t minAbsLevel = (roundAbsLevel > 1 ? roundAbsLevel - 1 : 1);

      m_testedLevels = 0;
      coeffLevels[m_testedLevels++] = roundAbsLevel;

      if (minAbsLevel != roundAbsLevel)
      {
        coeffLevels[m_testedLevels++] = minAbsLevel;
      }

      double dErr = double(levelDouble);
      coeffLevelError[0]  = dErr * dErr * errorScale;

      costCoeff0[scanPos] = coeffLevelError[0];
      dstCoeff[blkPos]    = coeffLevels[0];

      //===== coefficient level estimation =====
      unsigned    ctxIdSig = cctx.sigCtxIdAbsTS(scanPos, dstCoeff);
      uint32_t    cLevel;
      const BinFracBits fracBitsPar = fracBits.getFracBitsArray(cctx.parityCtxIdAbsTS());

      goRiceParam = cctx.templateAbsSumTS(scanPos, dstCoeff);
      if (tu.cu->slice->getSPS()->getSpsRangeExtension().getTSRCRicePresentFlag() && tu.mtsIdx[compID] == MtsType::SKIP)
      {
        goRiceParam = goRiceParam + tu.cu->slice->getTsrcIndex();
      }
      unsigned ctxIdSign = cctx.signCtxIdAbsTS(scanPos, dstCoeff, dirMode);
      const BinFracBits fracBitsSign = fracBits.getFracBitsArray(ctxIdSign);
      const uint8_t     sign = srcCoeff[blkPos] - predCoeff < 0 ? 1 : 0;
      unsigned gt1CtxId = cctx.lrg1CtxIdAbsTS(scanPos, dstCoeff, dirMode);
      const BinFracBits fracBitsGr1 = fracBits.getFracBitsArray(gt1CtxId);

      DTRACE_COND((dstCoeff[blkPos] != 0), g_trace_ctx, D_RDOQ_MORE, " uiCtxSig=%d", ctxIdSig);

      const BinFracBits fracBitsSig = fracBits.getFracBitsArray(ctxIdSig);
      bool lastCoeff = false; //
      if (scanPosInSB == lastPosCoded && noCoeffCoded == 0)
      {
        lastCoeff = true;
      }
      int rightPixel, belowPixel;
      cctx.neighTS(rightPixel, belowPixel, scanPos, dstCoeff);
      int numUsedCtxBins = 0;
      cLevel = xGetCodedLevelTSPred(costCoeff[scanPos], costCoeff0[scanPos], costSig[scanPos], levelDouble, qBits, errorScale, coeffLevels, coeffLevelError,
        &fracBitsSig, fracBitsPar, cctx, fracBits, fracBitsSign, fracBitsGr1, sign, rightPixel, belowPixel, goRiceParam, lastCoeff, extendedPrecision, maxLog2TrDynamicRange, numUsedCtxBins);
      cctx.decimateNumCtxBins(numUsedCtxBins);
      rdStats.iNumSbbCtxBins += numUsedCtxBins;
      if (cLevel > 0)
      {
        noCoeffCoded++;
      }
      dstCoeff[blkPos] = cLevel;

      if (sign)
      {
        dstCoeff[blkPos] = -dstCoeff[blkPos];
      }
      xDequantSample( m_fullCoeff[blkPos], dstCoeff[blkPos], trQuantParams );
      m_fullCoeff[blkPos] += predCoeff;

      baseCost += costCoeff[scanPos];
      rdStats.d64SigCost += costSig[scanPos];

      if (dstCoeff[blkPos])
      {
        cctx.setSigGroup();
        rdStats.d64CodedLevelandDist += costCoeff[scanPos] - costSig[scanPos];
        rdStats.d64UncodedDist += costCoeff0[scanPos];
      }
    } //end for (iScanPosinCG)

    if (!cctx.isSigGroup())
    {
      const BinFracBits fracBitsSigGroup = fracBits.getFracBitsArray(cctx.sigGroupCtxId(true));
      baseCost += xGetRateSigCoeffGroup(fracBitsSigGroup, 0) - rdStats.d64SigCost;
      costSigSubBlock[cctx.subSetId()] = xGetRateSigCoeffGroup(fracBitsSigGroup, 0);
      cctx.increaseNumCtxBins(rdStats.iNumSbbCtxBins); // skip sub-block
    }
    else if (sbId != sbNum - 1 || anySigCG)
    {
      // rd-cost if SigCoeffGroupFlag = 0, initialization
      double costZeroSB = baseCost;

      const BinFracBits fracBitsSigGroup = fracBits.getFracBitsArray(cctx.sigGroupCtxId(true));

      baseCost += xGetRateSigCoeffGroup(fracBitsSigGroup, 1);
      costZeroSB += xGetRateSigCoeffGroup(fracBitsSigGroup, 0);
      costSigSubBlock[cctx.subSetId()] = xGetRateSigCoeffGroup(fracBitsSigGroup, 1);

      costZeroSB += rdStats.d64UncodedDist;         // distortion for resetting non-zero levels to zero levels
      costZeroSB -= rdStats.d64CodedLevelandDist;   // distortion and level cost for keeping all non-zero levels
      costZeroSB -= rdStats.d64SigCost;             // sig cost for all coeffs, including zero levels and non-zerl levels

      if (costZeroSB < baseCost)
      {
        cctx.resetSigGroup();
        baseCost = costZeroSB;
        costSigSubBlock[cctx.subSetId()] = xGetRateSigCoeffGroup(fracBitsSigGroup, 0);
        cctx.increaseNumCtxBins(rdStats.iNumSbbCtxBins); // skip sub-block

        for (int scanPosInSB = 0; scanPosInSB <= sbSizeM1; scanPosInSB++)
        {
          scanPos = cctx.minSubPos() + scanPosInSB;
          uint32_t blkPos = cctx.blockPos(scanPos);

          const int posX = cctx.posX(scanPos);
          const int posY = cctx.posY(scanPos);
          const int posS = (BdpcmMode::HOR == dirMode) ? posX : posY;
          const ptrdiff_t posNb =
            (BdpcmMode::HOR == dirMode) ? (posX - 1) + posY * coeffs.stride : posX + (posY - 1) * coeffs.stride;
          m_fullCoeff[scanPos] = (0 != posS) ? m_fullCoeff[posNb] : 0;

          if (dstCoeff[blkPos])
          {
            dstCoeff[blkPos] = 0;
            costCoeff[scanPos] = costCoeff0[scanPos];
            costSig[scanPos] = 0;
          }
        }
      }
      else
      {
        anySigCG = true;
      }
    }
  }

  //===== estimate last position =====
  for (int scanPos = 0; scanPos < maxNumCoeff; scanPos++)
  {
    int blkPos = cctx.blockPos(scanPos);
    TCoeff level = dstCoeff[blkPos];
    absSum += abs(level);
  }
}

void QuantRDOQ::xDequantSample(TCoeff& pRes, TCoeff& coeff, const TrQuantParams& trQuantParams)
{
  // xDequant
  if (trQuantParams.rightShift > 0)
  {
    const Intermediate_Int qAdd = Intermediate_Int(1) << (trQuantParams.rightShift - 1);
    pRes = TCoeff((Intermediate_Int(coeff) * trQuantParams.qScale + qAdd) >> trQuantParams.rightShift);
  }
  else
  {
    pRes = TCoeff((Intermediate_Int(coeff) * trQuantParams.qScale) << -trQuantParams.rightShift);
  }
}

inline uint32_t QuantRDOQ::xGetCodedLevelTSPred(double&            rd64CodedCost,
  double&            rd64CodedCost0,
  double&            rd64CodedCostSig,
  Intermediate_Int    levelDouble,
  int                 qBits,
  double              errorScale,
  uint32_t coeffLevels[],
  double coeffLevelError[],
  const BinFracBits* fracBitsSig,
  const BinFracBits& fracBitsPar,
  CoeffCodingContext& cctx,
  const FracBitsAccess& fracBitsAccess,
  const BinFracBits& fracBitsSign,
  const BinFracBits& fracBitsGt1,
  const uint8_t      sign,
  int                rightPixel,
  int                belowPixel,
  uint16_t           ricePar,
  bool               isLast,
  bool               useLimitedPrefixLength,
  const int          maxLog2TrDynamicRange
  , int&               numUsedCtxBins
) const
{
  double currCostSig = 0;
  uint32_t   bestAbsLevel = 0;
  numUsedCtxBins = 0;
  int numBestCtxBin = 0;
  if (!isLast && coeffLevels[0] < 3)
  {
    if (cctx.numCtxBins() >= 4)
    {
      rd64CodedCostSig = xGetRateSigCoef(*fracBitsSig, 0);
    }
    else
    {
      rd64CodedCostSig = xGetICost(1 << SCALE_BITS);
    }
    rd64CodedCost = rd64CodedCost0 + rd64CodedCostSig;
    if (cctx.numCtxBins() >= 4)
    {
      numUsedCtxBins++;
    }
    if (coeffLevels[0] == 0)
    {
      return bestAbsLevel;
    }
  }
  else
  {
    rd64CodedCost = MAX_DOUBLE;
  }

  if (!isLast)
  {
    if (cctx.numCtxBins() >= 4)
    {
      currCostSig = xGetRateSigCoef(*fracBitsSig, 1);
    }
    else
    {
      currCostSig = xGetICost(1 << SCALE_BITS);
    }
    if (coeffLevels[0] >= 3 && cctx.numCtxBins() >= 4)
    {
      numUsedCtxBins++;
    }
  }

  for (int errorInd = 1; errorInd <= m_testedLevels; errorInd++)
  {
    int absLevel = coeffLevels[errorInd - 1];
    double dErr = 0.0;
    dErr = double(levelDouble - (Intermediate_Int(absLevel) << qBits));
    coeffLevelError[errorInd] = dErr * dErr * errorScale;
    int modAbsLevel = absLevel;
    if (cctx.numCtxBins() >= 4)
    {
      modAbsLevel = cctx.deriveModCoeff(rightPixel, belowPixel, absLevel, m_bdpcm != BdpcmMode::NONE);
    }
    int numCtxBins = 0;
    double dCurrCost = coeffLevelError[errorInd] + xGetICost(xGetICRateTS(modAbsLevel, fracBitsPar, cctx, fracBitsAccess, fracBitsSign, fracBitsGt1, numCtxBins, sign, ricePar, useLimitedPrefixLength, maxLog2TrDynamicRange));

    if (cctx.numCtxBins() >= 4)
      dCurrCost += currCostSig; // if cctx.numCtxBins < 4, xGetICRateTS return rate including sign cost. dont need to add any more

    if (dCurrCost < rd64CodedCost)
    {
      bestAbsLevel = absLevel;
      rd64CodedCost = dCurrCost;
      rd64CodedCostSig = currCostSig;
      numBestCtxBin = numCtxBins;
    }
  }
  numUsedCtxBins += numBestCtxBin;
  return bestAbsLevel;
}

inline int QuantRDOQ::xGetICRateTS( const uint32_t            absLevel,
                                    const BinFracBits&        fracBitsPar,
                                    const CoeffCodingContext& cctx,
                                    const FracBitsAccess&     fracBitsAccess,
                                    const BinFracBits&        fracBitsSign,
                                    const BinFracBits&        fracBitsGt1,
                                    int&                      numCtxBins,
                                    const uint8_t             sign,
                                    const uint16_t            ricePar,
                                    const bool                useLimitedPrefixLength,
                                    const int                 maxLog2TrDynamicRange  ) const
{


  if (cctx.numCtxBins() < 4) // Full by-pass coding
  {
    int rate = absLevel ? (1 << SCALE_BITS) : 0; // 1 bit to signal sign of non-zero

    uint32_t symbol = absLevel;

    uint32_t length;
    const int threshold = COEF_REMAIN_BIN_REDUCTION;
    if (symbol < (threshold << ricePar))
    {
      length = symbol >> ricePar;
      rate += (length + 1 + ricePar) << SCALE_BITS;
    }
    else if (useLimitedPrefixLength)
    {
      const uint32_t maximumPrefixLength = (32 - (COEF_REMAIN_BIN_REDUCTION + maxLog2TrDynamicRange));

      uint32_t prefixLength = 0;
      uint32_t suffix = (symbol >> ricePar) - COEF_REMAIN_BIN_REDUCTION;

      while ((prefixLength < maximumPrefixLength) && (suffix > ((2 << prefixLength) - 2)))
      {
        prefixLength++;
      }

      const uint32_t suffixLength = (prefixLength == maximumPrefixLength) ? (maxLog2TrDynamicRange - ricePar) : (prefixLength + 1/*separator*/);

      rate += (COEF_REMAIN_BIN_REDUCTION + prefixLength + suffixLength + ricePar) << SCALE_BITS;
    }
    else
    {
      length = ricePar;
      symbol = symbol - (threshold << ricePar);
      while (symbol >= (1 << length))
      {
        symbol -= (1 << (length++));
      }
      rate += (threshold + length + 1 - ricePar + length) << SCALE_BITS;
    }

    return rate;
  }

  else if (cctx.numCtxBins() >= 4 && cctx.numCtxBins() < 8) // First pass context coding and all by-pass coding ( Sign flag is not counted here)
  {
    int rate = fracBitsSign.intBits[sign]; // sign bits
    if (absLevel)
    {
      numCtxBins++;
    }

    if (absLevel > 1)
    {
      rate += fracBitsGt1.intBits[1];
      rate += fracBitsPar.intBits[(absLevel - 2) & 1];

      numCtxBins += 2;

      int cutoffVal = 2;

      if (absLevel >= cutoffVal)
      {
        uint32_t symbol = (absLevel - cutoffVal) >> 1;
        uint32_t length;
        const int threshold = COEF_REMAIN_BIN_REDUCTION;
        if (symbol < (threshold << ricePar))
        {
          length = symbol >> ricePar;
          rate += (length + 1 + ricePar) << SCALE_BITS;
        }
        else if (useLimitedPrefixLength)
        {
          const uint32_t maximumPrefixLength = (32 - (COEF_REMAIN_BIN_REDUCTION + maxLog2TrDynamicRange));

          uint32_t prefixLength = 0;
          uint32_t suffix = (symbol >> ricePar) - COEF_REMAIN_BIN_REDUCTION;

          while ((prefixLength < maximumPrefixLength) && (suffix > ((2 << prefixLength) - 2)))
          {
            prefixLength++;
          }

          const uint32_t suffixLength = (prefixLength == maximumPrefixLength) ? (maxLog2TrDynamicRange - ricePar) : (prefixLength + 1/*separator*/);

          rate += (COEF_REMAIN_BIN_REDUCTION + prefixLength + suffixLength + ricePar) << SCALE_BITS;
        }
        else
        {
          length = ricePar;
          symbol = symbol - (threshold << ricePar);
          while (symbol >= (1 << length))
          {
            symbol -= (1 << (length++));
          }
          rate += (threshold + length + 1 - ricePar + length) << SCALE_BITS;
        }
      }
    }
    else if (absLevel == 1)
    {
      rate += fracBitsGt1.intBits[0];
      numCtxBins++;
    }
    else
    {
      rate = 0;
    }
    return rate;
  }

  int rate = fracBitsSign.intBits[sign];

  if (absLevel)
  {
    numCtxBins++;
  }

  if( absLevel > 1 )
  {
    rate += fracBitsGt1.intBits[1];
    rate += fracBitsPar.intBits[( absLevel - 2 ) & 1];
    numCtxBins += 2;

          int cutoffVal = 2;
    const int numGtBins = 4;
    for( int i = 0; i < numGtBins; i++ )
    {
      if( absLevel >= cutoffVal )
      {
        const uint16_t ctxGtX = cctx.greaterXCtxIdAbsTS( cutoffVal>>1 );
        const BinFracBits &fracBitsGtX = fracBitsAccess.getFracBitsArray( ctxGtX );
        unsigned gtX = ( absLevel >= ( cutoffVal + 2 ) );
        rate += fracBitsGtX.intBits[gtX];
        numCtxBins++;
      }
      cutoffVal += 2;
    }

    if( absLevel >= cutoffVal )
    {
      uint32_t symbol = ( absLevel - cutoffVal ) >> 1;
      uint32_t length;
      const int threshold = COEF_REMAIN_BIN_REDUCTION;
      if( symbol < ( threshold << ricePar ) )
      {
        length = symbol >> ricePar;
        rate  += ( length + 1 + ricePar ) << SCALE_BITS;
      }
      else if( useLimitedPrefixLength )
      {
        const uint32_t maximumPrefixLength = ( 32 - ( COEF_REMAIN_BIN_REDUCTION + maxLog2TrDynamicRange ) );

        uint32_t prefixLength = 0;
        uint32_t suffix = ( symbol >> ricePar ) - COEF_REMAIN_BIN_REDUCTION;

        while( ( prefixLength < maximumPrefixLength ) && ( suffix > ( ( 2 << prefixLength ) - 2 ) ) )
        {
          prefixLength++;
        }

        const uint32_t suffixLength = ( prefixLength == maximumPrefixLength ) ? ( maxLog2TrDynamicRange - ricePar ) : ( prefixLength + 1/*separator*/ );

        rate += ( COEF_REMAIN_BIN_REDUCTION + prefixLength + suffixLength + ricePar ) << SCALE_BITS;
      }
      else
      {
        length = ricePar;
        symbol = symbol - ( threshold << ricePar );
        while( symbol >= ( 1 << length ) )
        {
          symbol -= ( 1 << ( length++ ) );
        }
        rate += ( threshold + length + 1 - ricePar + length ) << SCALE_BITS;
      }
    }
  }
  else if( absLevel == 1 )
  {
    rate += fracBitsGt1.intBits[0];
    numCtxBins++;
  }
  else
  {
    rate = 0;
  }
  return rate;
}

//! \}
